Loading important packages

library(readr,  warn.conflicts=F)
library(RColorBrewer,  warn.conflicts=F) #Rcolorbrewer palette
library(corrplot,  warn.conflicts=F)
## corrplot 0.84 loaded
library(ggcorrplot,  warn.conflicts=F)
## Loading required package: ggplot2
library(plotly,  warn.conflicts=F)
library(ggplot2, warn.conflicts=F)
library(reshape, warn.conflicts=F)
library(viridis, warn.conflicts=F)
## Loading required package: viridisLite
library(tidyverse, warn.conflicts=F)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ purrr   0.3.4     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::rename() masks reshape::rename(), plotly::rename()
library(hrbrthemes, warn.conflicts=F)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(psych, warn.conflicts=F)
library(class, warn.conflicts=F)
library(caret, warn.conflicts = F)
## Loading required package: lattice
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:psych':
## 
##     alpha
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(caret)
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## The following object is masked from 'package:reshape':
## 
##     expand
set.seed(123456789)

Importing Dataset

# Import the dataset with readr. The import warns that a missing column name
# was filled in as 'X33' and reports 569 parsing failures (33 columns
# expected, 32 found), so the result carries an extra, empty X33 column --
# presumably caused by a trailing delimiter in the header row.
data <- readr::read_csv(file = "~/Downloads/data.csv")
## Warning: Missing column names filled in: 'X33' [33]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   diagnosis = col_character(),
##   X33 = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 569 parsing failures.
## row col   expected     actual                   file
##   1  -- 33 columns 32 columns '~/Downloads/data.csv'
##   2  -- 33 columns 32 columns '~/Downloads/data.csv'
##   3  -- 33 columns 32 columns '~/Downloads/data.csv'
##   4  -- 33 columns 32 columns '~/Downloads/data.csv'
##   5  -- 33 columns 32 columns '~/Downloads/data.csv'
## ... ... .......... .......... ......................
## See problems(...) for more details.
data
## # A tibble: 569 x 33
##        id diagnosis radius_mean texture_mean perimeter_mean area_mean
##     <dbl> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
##  1 8.42e5 M                18.0         10.4          123.      1001 
##  2 8.43e5 M                20.6         17.8          133.      1326 
##  3 8.43e7 M                19.7         21.2          130       1203 
##  4 8.43e7 M                11.4         20.4           77.6      386.
##  5 8.44e7 M                20.3         14.3          135.      1297 
##  6 8.44e5 M                12.4         15.7           82.6      477.
##  7 8.44e5 M                18.2         20.0          120.      1040 
##  8 8.45e7 M                13.7         20.8           90.2      578.
##  9 8.45e5 M                13           21.8           87.5      520.
## 10 8.45e7 M                12.5         24.0           84.0      476.
## # … with 559 more rows, and 27 more variables: smoothness_mean <dbl>,
## #   compactness_mean <dbl>, concavity_mean <dbl>, `concave points_mean` <dbl>,
## #   symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## #   texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>, X33 <chr>

Looking at dataset

# Preview the first six rows of the dataset.
head(data, n = 6L)
## # A tibble: 6 x 33
##       id diagnosis radius_mean texture_mean perimeter_mean area_mean
##    <dbl> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
## 1 8.42e5 M                18.0         10.4          123.      1001 
## 2 8.43e5 M                20.6         17.8          133.      1326 
## 3 8.43e7 M                19.7         21.2          130       1203 
## 4 8.43e7 M                11.4         20.4           77.6      386.
## 5 8.44e7 M                20.3         14.3          135.      1297 
## 6 8.44e5 M                12.4         15.7           82.6      477.
## # … with 27 more variables: smoothness_mean <dbl>, compactness_mean <dbl>,
## #   concavity_mean <dbl>, `concave points_mean` <dbl>, symmetry_mean <dbl>,
## #   fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## #   perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>, X33 <chr>

Columns in dataset

# List the column names of the dataset.
names(data)
##  [1] "id"                      "diagnosis"              
##  [3] "radius_mean"             "texture_mean"           
##  [5] "perimeter_mean"          "area_mean"              
##  [7] "smoothness_mean"         "compactness_mean"       
##  [9] "concavity_mean"          "concave points_mean"    
## [11] "symmetry_mean"           "fractal_dimension_mean" 
## [13] "radius_se"               "texture_se"             
## [15] "perimeter_se"            "area_se"                
## [17] "smoothness_se"           "compactness_se"         
## [19] "concavity_se"            "concave points_se"      
## [21] "symmetry_se"             "fractal_dimension_se"   
## [23] "radius_worst"            "texture_worst"          
## [25] "perimeter_worst"         "area_worst"             
## [27] "smoothness_worst"        "compactness_worst"      
## [29] "concavity_worst"         "concave points_worst"   
## [31] "symmetry_worst"          "fractal_dimension_worst"
## [33] "X33"

Checking for null values

# Base-R alternative for counting missing values per column (kept for
# reference):
# lapply(data,function(x) { length(which(is.na(x)))})
#
# skim() summary: of the 2 character variables, diagnosis has no missing
# values while X33 is missing in all 569 rows; none of the 31 numeric
# variables has any missing values.
skimr::skim(data)
Data summary
Name data
Number of rows 569
Number of columns 33
_______________________
Column type frequency:
character 2
numeric 31
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
diagnosis 0 1 1 1 0 2 0
X33 569 0 NA NA 0 0 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 30371831.43 125020585.61 8670.00 869218.00 906024.00 8813129.00 911320502.00 ▇▁▁▁▁
radius_mean 0 1 14.13 3.52 6.98 11.70 13.37 15.78 28.11 ▂▇▃▁▁
texture_mean 0 1 19.29 4.30 9.71 16.17 18.84 21.80 39.28 ▃▇▃▁▁
perimeter_mean 0 1 91.97 24.30 43.79 75.17 86.24 104.10 188.50 ▃▇▃▁▁
area_mean 0 1 654.89 351.91 143.50 420.30 551.10 782.70 2501.00 ▇▃▂▁▁
smoothness_mean 0 1 0.10 0.01 0.05 0.09 0.10 0.11 0.16 ▁▇▇▁▁
compactness_mean 0 1 0.10 0.05 0.02 0.06 0.09 0.13 0.35 ▇▇▂▁▁
concavity_mean 0 1 0.09 0.08 0.00 0.03 0.06 0.13 0.43 ▇▃▂▁▁
concave points_mean 0 1 0.05 0.04 0.00 0.02 0.03 0.07 0.20 ▇▃▂▁▁
symmetry_mean 0 1 0.18 0.03 0.11 0.16 0.18 0.20 0.30 ▁▇▅▁▁
fractal_dimension_mean 0 1 0.06 0.01 0.05 0.06 0.06 0.07 0.10 ▆▇▂▁▁
radius_se 0 1 0.41 0.28 0.11 0.23 0.32 0.48 2.87 ▇▁▁▁▁
texture_se 0 1 1.22 0.55 0.36 0.83 1.11 1.47 4.88 ▇▅▁▁▁
perimeter_se 0 1 2.87 2.02 0.76 1.61 2.29 3.36 21.98 ▇▁▁▁▁
area_se 0 1 40.34 45.49 6.80 17.85 24.53 45.19 542.20 ▇▁▁▁▁
smoothness_se 0 1 0.01 0.00 0.00 0.01 0.01 0.01 0.03 ▇▃▁▁▁
compactness_se 0 1 0.03 0.02 0.00 0.01 0.02 0.03 0.14 ▇▃▁▁▁
concavity_se 0 1 0.03 0.03 0.00 0.02 0.03 0.04 0.40 ▇▁▁▁▁
concave points_se 0 1 0.01 0.01 0.00 0.01 0.01 0.01 0.05 ▇▇▁▁▁
symmetry_se 0 1 0.02 0.01 0.01 0.02 0.02 0.02 0.08 ▇▃▁▁▁
fractal_dimension_se 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.03 ▇▁▁▁▁
radius_worst 0 1 16.27 4.83 7.93 13.01 14.97 18.79 36.04 ▆▇▃▁▁
texture_worst 0 1 25.68 6.15 12.02 21.08 25.41 29.72 49.54 ▃▇▆▁▁
perimeter_worst 0 1 107.26 33.60 50.41 84.11 97.66 125.40 251.20 ▇▇▃▁▁
area_worst 0 1 880.58 569.36 185.20 515.30 686.50 1084.00 4254.00 ▇▂▁▁▁
smoothness_worst 0 1 0.13 0.02 0.07 0.12 0.13 0.15 0.22 ▂▇▇▂▁
compactness_worst 0 1 0.25 0.16 0.03 0.15 0.21 0.34 1.06 ▇▅▁▁▁
concavity_worst 0 1 0.27 0.21 0.00 0.11 0.23 0.38 1.25 ▇▅▂▁▁
concave points_worst 0 1 0.11 0.07 0.00 0.06 0.10 0.16 0.29 ▅▇▅▃▁
symmetry_worst 0 1 0.29 0.06 0.16 0.25 0.28 0.32 0.66 ▅▇▁▁▁
fractal_dimension_worst 0 1 0.08 0.02 0.06 0.07 0.08 0.09 0.21 ▇▃▁▁▁

We can see that the features in the dataset fall into three categories: mean, se and worst.

DATA WRANGLING: Deleting the X33 column, as it appears to be an artifact of importing the dataset

# Remove the X33 column by keeping only the column names that are not listed
# in `drops`, then print the result to confirm the column is gone.
drops <- "X33"
data <- data[, setdiff(names(data), drops)]
data
## # A tibble: 569 x 32
##        id diagnosis radius_mean texture_mean perimeter_mean area_mean
##     <dbl> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
##  1 8.42e5 M                18.0         10.4          123.      1001 
##  2 8.43e5 M                20.6         17.8          133.      1326 
##  3 8.43e7 M                19.7         21.2          130       1203 
##  4 8.43e7 M                11.4         20.4           77.6      386.
##  5 8.44e7 M                20.3         14.3          135.      1297 
##  6 8.44e5 M                12.4         15.7           82.6      477.
##  7 8.44e5 M                18.2         20.0          120.      1040 
##  8 8.45e7 M                13.7         20.8           90.2      578.
##  9 8.45e5 M                13           21.8           87.5      520.
## 10 8.45e7 M                12.5         24.0           84.0      476.
## # … with 559 more rows, and 26 more variables: smoothness_mean <dbl>,
## #   compactness_mean <dbl>, concavity_mean <dbl>, `concave points_mean` <dbl>,
## #   symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## #   texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>


Finally, we got rid of all the missing values, so the modified data is ready to use for further analysis.


Let’s look at the correlation matrix to see the correlations between all the variables

# Pearson correlation across every numeric column. vapply() replaces sapply()
# so the column mask is guaranteed to be a logical vector whatever the input.
# NOTE(review): the numeric `id` column is part of this matrix; as an
# identifier it presumably carries no signal -- confirm whether it should be
# excluded before computing correlations.
matrixData <- cor(
  data[vapply(data, is.numeric, logical(1))],
  method = "pearson"
)

# 25-step colour ramp interpolated from the 8-colour RColorBrewer "PiYG"
# palette, used for the heatmap below.
coul <- colorRampPalette(brewer.pal(8, "PiYG"))(25)
heatmap(matrixData, scale = "column", col = coul)

# Correlogram of the same matrix, ordered by average-linkage hierarchical
# clustering with rectangles drawn around 4 clusters.
corrplot(matrixData, tl.col = "black", order = "hclust",
         hclust.method = "average", addrect = 4, tl.cex = 0.7)

#data <- sapply(data,is.numeric)
# Pearson correlation matrices for the three feature families. Columns are
# selected by their name suffix rather than by position (3:12, 13:22, 23:32)
# so the selection does not silently shift if the column order or count
# changes; on this dataset both approaches pick the same ten columns each.
data.mean <- cor(data[, endsWith(names(data), "_mean")], method = "pearson")
data.se <- cor(data[, endsWith(names(data), "_se")], method = "pearson")
data.worst <- cor(data[, endsWith(names(data), "_worst")], method = "pearson")


# Draw one clustered correlogram per feature family (mean, se, worst), all
# with the same display settings.
for (family_cor in list(data.mean, data.se, data.worst)) {
  corrplot(family_cor, tl.col = "black", order = "hclust",
           hclust.method = "average", addrect = 4, tl.cex = 0.7)
}


# Count the observations in each diagnosis class.
table(data[["diagnosis"]])
## 
##   B   M 
## 357 212
# Per-diagnosis counts together with each class's share of all observations.
relative_freq <- data %>%
  count(diagnosis) %>%
  mutate(relative_freq = n / sum(n))
relative_freq
## # A tibble: 2 x 3
##   diagnosis     n relative_freq
##   <chr>     <int>         <dbl>
## 1 B           357         0.627
## 2 M           212         0.373
# Bar chart of the number of observations per diagnosis. Bars are filled by
# diagnosis using the "Set1" Brewer palette and the legend is hidden.
ggplot(
  data,
  aes(x = as.factor(diagnosis), fill = as.factor(diagnosis))
) +
  geom_bar() +
  labs(title = "Barplot representing two different tumors") +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "none")

# Scatter-plot matrices for each feature family. Columns are selected by name
# suffix instead of hard-coded positions (3:12, 13:22, 23:32) so the panels
# stay correct if the column layout changes; on this dataset the selected
# columns are the same.
pairs.panels(data[, endsWith(names(data), "_mean")], main = "Cancer Mean")

pairs.panels(data[, endsWith(names(data), "_se")], main = "Cancer SE")

pairs.panels(data[, endsWith(names(data), "_worst")], main = "Cancer Worst")

Now we will construct nine violin plots showing, by diagnosis, the distribution of the tumor's radius, perimeter and area for each of the mean, se and worst measurements.

# Draw a violin plot of one numeric column split by diagnosis, overlaid with a
# very narrow box plot.
#
# Arguments:
#   df           Data frame containing a `diagnosis` column and `column`.
#   column       Name (string) of the column mapped to the y axis.
#   title        Plot title.
#   outlier_size Point size used for the box-plot outliers.
#
# Returns the ggplot object, so a top-level call renders the plot.
plot_violin_by_diagnosis <- function(df, column, title, outlier_size = 3) {
  ggplot(df, aes(x = diagnosis, y = .data[[column]])) +
    geom_violin(fill = "cornflowerblue") +
    geom_boxplot(
      width = .01, fill = "orange",
      outlier.color = "orange", outlier.size = outlier_size
    ) +
    # Label the y axis with the plain column name rather than the
    # `.data[[column]]` expression used in the mapping.
    labs(title = title, y = column)
}

# Radius: mean, se and worst distributions by diagnosis.
# NOTE(review): the first two plots use outlier size 2 while the remaining
# seven use 3; kept as-is to preserve the rendered figures -- confirm whether
# the difference is intentional.
plot_violin_by_diagnosis(
  data, "radius_mean", "Radius Mean distribution by diagnosis",
  outlier_size = 2
)

plot_violin_by_diagnosis(
  data, "radius_se", "Radius Se distribution by diagnosis",
  outlier_size = 2
)

plot_violin_by_diagnosis(
  data, "radius_worst", "Radius Worst distribution by diagnosis"
)

# Area: mean, se and worst distributions by diagnosis.
plot_violin_by_diagnosis(
  data, "area_mean", "Area Mean distribution by diagnosis"
)

plot_violin_by_diagnosis(
  data, "area_se", "Area_Se distribution by diagnosis"
)

plot_violin_by_diagnosis(
  data, "area_worst", "Area Worst distribution by diagnosis"
)

# Perimeter: se, mean and worst distributions by diagnosis.
plot_violin_by_diagnosis(
  data, "perimeter_se", "Perimeter_se distribution by diagnosis"
)

plot_violin_by_diagnosis(
  data, "perimeter_mean", "perimeter mean distribution by diagnosis"
)

plot_violin_by_diagnosis(
  data, "perimeter_worst", "Perimeter Worst distribution by diagnosis"
)


Let’s split the data now to see how tumors differ for M and B

# Partition the rows by diagnosis: split() yields one subset per diagnosis
# value, which are then stored separately for the B and M groups.
cancer_split <- split(data, data[["diagnosis"]])
dataB <- cancer_split[["B"]]
dataM <- cancer_split[["M"]]

# Bar chart of observation counts per diagnosis, drawn from the full dataset.
ggplot(
  data,
  aes(x = as.factor(diagnosis), fill = as.factor(diagnosis))
) +
  labs(title = "Barplot representing two different tumors") +
  geom_bar()


Now we have two different datasets for B and M